library(tidyverse)
library(data.table)
library(dtplyr)
library(lubridate)
library(stringr)
library(zoo)
library(here)
library(sassy)
library(RPostgres)

# Open the log
lf <- log_open(file.path(here("output", "log"), "log_C_CompustatQuarterly.log"),
               autolog = T, show_notes = F)


# Send code to the log
log_code()

# 1 Read data from WRDS -------------------------------------------------------------------
sep("1 Read data from WRDS")

wrds <- dbConnect(Postgres(),
                  host='wrds-pgdata.wharton.upenn.edu',
                  port=9737,
                  dbname='wrds',
                  sslmode='require',
                  user='...'). # Input your user name on wrds

SQL_statement <- 
  "
  SELECT a.gvkey, a.datadate, a.fyearq, a.fqtr, a.datacqtr, a.datafqtr, a.acoq,
		     a.actq,a.ajexq,a.apq,a.atq,a.ceqq,a.cheq,a.cogsq,a.cshoq,a.cshprq,
		     a.dlcq,a.dlttq,a.dpq,a.drcq,a.drltq,a.dvpsxq,a.dvpq,a.dvy,a.epspiq,a.epspxq,a.fopty,
		     a.gdwlq,a.ibq,a.invtq,a.intanq,a.ivaoq,a.lcoq,a.lctq,a.loq,a.ltq,a.mibq,
		     a.niq,a.oancfy,a.oiadpq,a.oibdpq,a.piq,a.ppentq,a.ppegtq,a.prstkcy,a.prccq,
		     a.pstkq,a.rdq,a.req,a.rectq,a.revtq,a.saleq,a.seqq,a.sstky,a.txdiq,
		     a.txditcq,a.txpq,a.txtq,a.xaccq,a.xintq,a.xsgaq,a.xrdq, a.capxy
  FROM COMP.FUNDQ as a
	WHERE a.consol = 'C'
	AND a.popsrc = 'D'
	AND a.datafmt = 'STD'
	AND a.curcdq = 'USD'
	AND a.indfmt = 'INDL'
"

res <- dbSendQuery(conn = wrds, statement = SQL_statement)
df_CompustatQuarterly <- dbFetch(res)
dbClearResult(res)

# 2 Cleaning -------------------------------------------------------------------
sep("2 Cleaning")

# Convert the data frame to a data.table
setDT(df_CompustatQuarterly)

# Keep only the most recent data for each fiscal quarter
df_CompustatQuarterly <- df_CompustatQuarterly %>%
  arrange(datadate) %>%
  group_by(gvkey, fyearq, fqtr) %>%
  slice(n()) %>%
  ungroup() %>% 
  as.data.table()

# Data availability assumed
# Assume data available with a 3-month lag
df_CompustatQuarterly <- df_CompustatQuarterly %>%
  mutate(time_avail_m = as.Date(datadate) %m+% months(3)) %>% 
  mutate(time_avail_m = year(time_avail_m) * 100 + month(time_avail_m))

# Patch cases with earlier data availability
df_CompustatQuarterly <- df_CompustatQuarterly %>%
  mutate(rdq_ym = year(rdq)*100+month(rdq))  %>% 
  mutate(time_avail_m = ifelse(!is.na(rdq) & rdq_ym > time_avail_m, rdq_ym, time_avail_m))

# Drop cases with very late release
df_CompustatQuarterly <- df_CompustatQuarterly %>% 
  filter(!(as.numeric(rdq - datadate) > 180 & !is.na(rdq)))

# Keep the most recent information for each gvkey and time_avail_m
df_CompustatQuarterly <- df_CompustatQuarterly %>%
  arrange(datadate) %>%
  group_by(gvkey, time_avail_m) %>%
  slice(n()) %>%
  ungroup() %>% 
  as.data.table()

# Handling missing values assumed to be 0
zero_cols <- c("acoq", "actq", "apq", "cheq", "dpq", "drcq", "invtq", "intanq", "ivaoq", 
             "gdwlq", "lcoq", "lctq", "loq", "mibq", "prstkcy", "rectq", "sstky", "txditcq")

# Replace missing values with 0 in the specified columns using %>% (pipe)
df_CompustatQuarterly <- df_CompustatQuarterly %>%
  mutate(across(all_of(zero_cols), ~ifelse(is.na(.), 0, .)))

# Prepare year-to-date items
df_CompustatQuarterly <- df_CompustatQuarterly %>% arrange(gvkey, fyearq, fqtr)

ytdVars <- c("sstky", "prstkcy", "oancfy", "fopty")

for (v in ytdVars) {
  df_CompustatQuarterly <- df_CompustatQuarterly %>%
    group_by(gvkey, fyearq) %>%
    mutate(!!paste0(v, "q") := ifelse(fqtr == 1, .data[[v]], .data[[v]] - lag(.data[[v]]))) %>%
    ungroup() %>%
    as.data.table()
}

# save
fwrite(df_CompustatQuarterly, here("data", "Compustat", "q_QCompustat.csv"))
saveRDS(df_CompustatQuarterly, here("data", "Compustat", "q_QCompustat.RDS"))

# 3 Monthly version -------------------------------------------------------------------
sep("Monthly version")

# Expand to monthly

df_m_QCompustat <- df_CompustatQuarterly 
setDT(df_m_QCompustat)

df_m_QCompustat <- df_m_QCompustat[, .SD[rep(1, 3)], by = .(gvkey, time_avail_m)]
# Update 'time_avail_m' based on group-specific calculations
df_m_QCompustat[, datadate2 := datadate %m+% months(0:(.N - 1)), by = .(gvkey, time_avail_m)]
df_m_QCompustat[, datadate2 := as.Date(datadate2) %m+% months(3)] 
df_m_QCompustat[, time_avail_m := year(datadate2)*100+month(datadate2)]

# A few obervation have two rows in the same month after expanding, keep the most recent obs
df_m_QCompustat <- df_m_QCompustat %>%
  arrange(datadate) %>%
  group_by(gvkey, time_avail_m) %>%
  slice(n()) %>%
  ungroup() %>%
  as.data.table()

df_m_QCompustat <- df_m_QCompustat %>%
  #select(-tempTimeAvailM, -fyearq, -fqtr, -datacqtr, -datafqtr) %>%
  rename(datadate = datadate2,
         datadateq = datadate) 


# save
fwrite(df_m_QCompustat, here("data", "Compustat", "m_QCompustat.csv"))
saveRDS(df_m_QCompustat, here("data", "Compustat", "m_QCompustat.RDS"))

# Close log
log_close()

# View results
writeLines(readLines(lf))